pacman::p_load(tidyverse, plotly, crosstalk, DT, ggdist, gganimate, ggstatsplot, heatmaply)
HDB <- read_csv(("data/HDB.csv"))
#Create addtional data on price per sqm
HDB$price_per_sqm <- (HDB$resale_price / HDB$floor_area_sqm)Take Home Ex 3
#Filter 3Room, 4Room, 5Room
HDBRoom <- HDB %>% filter(flat_type=="3 ROOM" | flat_type=="4 ROOM" | flat_type=="5 ROOM") %>%
separate(month, into = c("year", "month")) %>%
filter(year == "2022") %>%
separate(remaining_lease, into = c("rmlease_years", "rmlease_month"), sep = "years")
HDBRoom$region <- case_when(
HDBRoom$town %in% c("ANG MO KIO", "HOUGANG", "PUNGGOL", "SERANGOON", "SENGKANG") ~ "North-East",
HDBRoom$town %in% c("BISHAN", "BUKIT MERAH", "BUKIT TIMAH", "CENTRAL AREA", "GEYLANG", "KALLANG/WHAMPOA", "MARINE PARADE", "QUEENSTOWN", "TOA PAYOH") ~ "Central",
HDBRoom$town %in% c("BEDOK", "PASIR RIS", "TAMPINES") ~ "East",
HDBRoom$town %in% c("SEMBAWANG", "WOODLANDS", "YISHUN") ~ "North",
HDBRoom$town %in% c("BUKIT BATOK", "BUKIT PANJANG", "CHOA CHU KANG", "CLEMENTI", "JURONG EAST", "JURONG WEST") ~ "West")
HDBRoom$rmlease_years <- as.numeric(HDBRoom$rmlease_years)
HDBRoom$rmlease_month <- gsub("[monthsmonth]", " ", HDBRoom$rmlease_month) %>%
as.numeric(HDBRoom$rmlease_month) / 12
HDBRoom# A tibble: 24,374 × 15
year month town flat_t…¹ block stree…² store…³ floor…⁴ flat_…⁵ lease…⁶
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr> <dbl>
1 2022 01 ANG MO KIO 3 ROOM 320 ANG MO… 07 TO … 73 New Ge… 1977
2 2022 01 ANG MO KIO 3 ROOM 225 ANG MO… 07 TO … 67 New Ge… 1978
3 2022 01 ANG MO KIO 3 ROOM 331 ANG MO… 07 TO … 68 New Ge… 1981
4 2022 01 ANG MO KIO 3 ROOM 534 ANG MO… 07 TO … 82 New Ge… 1980
5 2022 01 ANG MO KIO 3 ROOM 578 ANG MO… 04 TO … 67 New Ge… 1980
6 2022 01 ANG MO KIO 3 ROOM 452 ANG MO… 01 TO … 83 New Ge… 1979
7 2022 01 ANG MO KIO 3 ROOM 560 ANG MO… 01 TO … 67 New Ge… 1980
8 2022 01 ANG MO KIO 3 ROOM 435 ANG MO… 04 TO … 67 New Ge… 1979
9 2022 01 ANG MO KIO 3 ROOM 435 ANG MO… 04 TO … 67 New Ge… 1979
10 2022 01 ANG MO KIO 3 ROOM 560 ANG MO… 10 TO … 67 New Ge… 1980
# … with 24,364 more rows, 5 more variables: rmlease_years <dbl>,
# rmlease_month <dbl>, resale_price <dbl>, price_per_sqm <dbl>, region <chr>,
# and abbreviated variable names ¹flat_type, ²street_name, ³storey_range,
# ⁴floor_area_sqm, ⁵flat_model, ⁶lease_commence_date
HDBRoom$storey_range <- factor (HDBRoom$storey_range, levels = unique(HDBRoom$storey_range))
ggplot(data = HDBRoom,
aes(x =storey_range)) + geom_bar()
unique(HDBRoom$storey_range) [1] 07 TO 09 04 TO 06 01 TO 03 10 TO 12 13 TO 15 25 TO 27 16 TO 18 19 TO 21
[9] 22 TO 24 28 TO 30 34 TO 36 31 TO 33 37 TO 39 40 TO 42 43 TO 45 49 TO 51
[17] 46 TO 48
17 Levels: 07 TO 09 04 TO 06 01 TO 03 10 TO 12 13 TO 15 25 TO 27 ... 46 TO 48
unique(HDBRoom$town) [1] "ANG MO KIO" "BEDOK" "BISHAN" "BUKIT BATOK"
[5] "BUKIT MERAH" "BUKIT PANJANG" "BUKIT TIMAH" "CENTRAL AREA"
[9] "CHOA CHU KANG" "CLEMENTI" "GEYLANG" "HOUGANG"
[13] "JURONG EAST" "JURONG WEST" "KALLANG/WHAMPOA" "MARINE PARADE"
[17] "PASIR RIS" "PUNGGOL" "QUEENSTOWN" "SEMBAWANG"
[21] "SENGKANG" "SERANGOON" "TAMPINES" "TOA PAYOH"
[25] "WOODLANDS" "YISHUN"
HDBRoom$rmlease_month[is.na(HDBRoom$rmlease_month)] = 0
HDBRoom$rmlease <- as.numeric(HDBRoom$rmlease_years + HDBRoom$rmlease_month)
HDBRoom$storey_range <- str_replace(HDBRoom$storey_range, "TO", "-")
sr_sort = c("01 - 03", "04 - 06", "07 - 09", "10 - 12", "13 - 15", "16 - 18", "19 - 21", "22 - 24","25 - 27","28 - 30", "31 - 33", "34 - 36", "37 - 39", "40 - 42", "43 - 45", "49 - 51", "46 - 48")
HDBRoom$storey_range <- factor (HDBRoom$storey_range, levels = sr_sort)HDBDATA <- HDBRoom [,!names(HDBRoom) %in% c("year", "block", "street_name", "rmlease_years", "rmlease_month", "flat_model")]gghistostats(
data = HDBDATA, x = "floor_area_sqm",
type = "bayes",
test.value = 100,
xlab = "Floor Area (sqm) of property sold"
)
ggbetweenstats(
data = HDBDATA,
x = flat_type,
y = resale_price,
type = "np",
messages = FALSE
)
ggscatterstats(
data = HDBDATA,
x = resale_price,
y = price_per_sqm,
marginal = FALSE,
)
options(scipen = 999)
mean(HDBDATA$resale_price)[1] 536391.2
min(HDBDATA$resale_price)[1] 200000
max(HDBDATA$resale_price)[1] 1418000
mean(HDBDATA$price_per_sqm)[1] 5735.973
min(HDBDATA$price_per_sqm)[1] 3333.333
max(HDBDATA$price_per_sqm)[1] 14731.18
scdata <- highlight_key(HDBDATA)
sc1 <- ggplot(data = scdata, aes(x = town, y = resale_price, fill = region)) + geom_point() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + scale_y_continuous(breaks = c(200000,500000,1000000,150000)) +
labs(title = "Resale Price by Town", x = "Town", y = "Resale Price")
sc2 <- ggplot(data = scdata, aes(x = town, y = price_per_sqm, fill = region)) + geom_point() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + scale_y_continuous(breaks = c(3000,6000,9000,12000,15000)) +
labs(title = "Resale Price per sqm by Town", x = "Town", y = "Resale Price/Sqm")
subplot(ggplotly(sc1), ggplotly(sc2))HDBDATA %>%
mutate(class = fct_reorder(town, price_per_sqm, .fun="mean")) %>%
ggplot(aes(y =reorder(town, price_per_sqm),
x = price_per_sqm, fill = region)) +
geom_boxplot() + stat_summary(fun.y=mean, geom = "point", colour="yellow")
HDBDATA %>%
group_by(region) %>%
mutate(class = fct_reorder(region, price_per_sqm, .fun="mean")) %>%
ggplot(mapping = aes(y = flat_type, x = price_per_sqm)) +
# Make grouped boxplot
geom_boxplot(aes(fill = as.factor(region))) +
theme(legend.position = "top") +
# Adjust lables and add title
labs(title = "HDB resale prices in 2022 by region", y="Flat Type", x = "Price per square metre (SGD)", fill = "flat_type")
HDBDATA %>%
grouped_gghistostats(
x = resale_price,
test.value = 50,
type = "nonparametric",
grouping.var = region,
normal.curve = TRUE,
normal.curve.args = list(color = "red", size = 1),
ggtheme = ggthemes::theme_tufte(),
## modify the defaults from `{ggstatsplot}` for each plot
plotgrid.args = list(nrow = 2),
annotation.args = list(title = "Resale price by region")
)
floorheatmap <-
HDBDATA %>%
group_by(town, storey_range) %>%
summarise(median_price = median(price_per_sqm))
heatmap <- ggplot(data = floorheatmap,
mapping = aes(x = town, y = storey_range, fill = median_price)) +
geom_tile() +
labs(title = "Heatmap of HDB breakdown by area and storey", x = "Town", y = "Storey") +
scale_fill_gradient(name = "Median Resale Price/sqm",
low = "peachpuff",
high = "deeppink4")+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
heatmap